# Standard imports. Note: You must pip install nasdaqdatalink 1st
import os
import pandas as pd
import hvplot.pandas
from pathlib import Path
# For API Calls
import nasdaqdatalink
# Do we need requests?
import requests
# For opening zip folder
import shutil
# For technical analysis
import pandas_ta as ta
# Link the Nasdaq Data Link API key stored in ".env" in the same folder.
# The file contains only the raw key string — no quotes, no KEY= prefix —
# because read_key() loads the file's contents verbatim as the key.
nasdaqdatalink.read_key(filename=".env")
# A function to retrieve a dataframe of counties, zips, etc
def get_regions(regions):
    """Fetch the ZILLOW/REGIONS table filtered to the given region type.

    regions: region-type string accepted by the Zillow API,
             e.g. 'county', 'zip', 'state'.
    Returns the resulting dataframe.
    """
    return nasdaqdatalink.get_table('ZILLOW/REGIONS', region_type=regions)
The next step retrieves the region table, which contains a list of all counties in the US.
# Using get_regions to retrieve a list of counties
region_df = get_regions('county')
# 'region' looks like "Durham County; NC; Durham-Chapel Hill": the first
# split (n=1) isolates the county name, the second keeps only the state code.
# n= must be passed by keyword — positional was deprecated in pandas 1.4
# and removed in pandas 2.0.
region_df[["county", "state"]] = region_df["region"].str.split(';', n=1, expand=True)
region_df["state"] = region_df["state"].str.split(';', n=1, expand=True)[0]
#
# Clean up regions data
# Remove ' County' so that we can match the Zillow data with Wikipedia data.
# regex=False: this is a literal replacement, and it silences the
# FutureWarning about the changing regex default.
region_df["county"] = region_df["county"].str.replace(" County", "", regex=False)
# Strip the leading blank space left on the 'state' column by the split.
# (.str.strip() is safer than slicing off exactly one character.)
region_df["state"] = region_df["state"].str.strip()
# Clean up region_id datatype.
region_df['region_id'] = region_df['region_id'].astype(int)
# Check data for region_df
print(region_df.head())
print(region_df.tail())
| region_id | region_type | region | county | state | |
|---|---|---|---|---|---|
| None | |||||
| 0 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC |
| 1 | 998 | county | Duplin County; NC | Duplin | NC |
| 2 | 997 | county | Dubois County; IN; Jasper | Dubois | IN |
| 3 | 995 | county | Donley County; TX | Donley | TX |
| 4 | 993 | county | Dimmit County; TX | Dimmit | TX |
| region_id | region_type | region | county | state | |
|---|---|---|---|---|---|
| None | |||||
| 2886 | 1003 | county | Elmore County; AL; Montgomery | Elmore | AL |
| 2887 | 1002 | county | Elbert County; GA | Elbert | GA |
| 2888 | 1001 | county | Elbert County; CO; Denver-Aurora-Lakewood | Elbert | CO |
| 2889 | 1000 | county | Echols County; GA; Valdosta | Echols | GA |
| 2890 | 100 | county | Bibb County; AL; Birmingham-Hoover | Bibb | AL |
In this example, we read in Zillow sales data in the form of a CSV file.
# Get the Zillow sales data.
# The actual API call using the SDK.
# Instructions can be found here https://data.nasdaq.com/databases/ZILLOW/usage/quickstart/python
# Replace 'quandl' w/ 'nasdaqdatalink
# Turned into a function to prevent constant re-downloading massive csv
def get_zillow_data():
    """Export ZSFH (single-family home) data for our counties and unpack it.

    Wrapped in a function so the (large) download only happens when
    explicitly invoked, not on every notebook run.
    """
    export = nasdaqdatalink.export_table(
        'ZILLOW/DATA',
        indicator_id='ZSFH',
        region_id=list(region_df['region_id']),
        filename='db.zip',
    )
    # Unzip the downloaded archive into the working directory.
    shutil.unpack_archive('db.zip')
    return export
# Reading in Database
# NOTE(review): the hashed filename below is whatever a previous
# get_zillow_data() export unpacked to — a fresh export will produce a
# different hash; confirm the name before re-running from scratch.
zillow_data=pd.read_csv(
Path('ZILLOW_DATA_d5d2ff90eb7172dbde848ea36de12dfe.csv')
)
# Check the Zillow sales data
print(zillow_data.head())
print(zillow_data.tail())
| indicator_id | region_id | date | value | |
|---|---|---|---|---|
| 0 | ZSFH | 100 | 2007-11-30 | 123760.0 |
| 1 | ZSFH | 100 | 2007-12-31 | 123754.0 |
| 2 | ZSFH | 100 | 2008-01-31 | 123605.0 |
| 3 | ZSFH | 100 | 2008-02-29 | 123393.0 |
| 4 | ZSFH | 100 | 2008-03-31 | 123095.0 |
| indicator_id | region_id | date | value | |
|---|---|---|---|---|
| 669311 | ZSFH | 999 | 2022-02-28 | 390111.0 |
| 669312 | ZSFH | 999 | 2022-03-31 | 401621.0 |
| 669313 | ZSFH | 999 | 2022-04-30 | 411421.0 |
| 669314 | ZSFH | 999 | 2022-05-31 | 422028.0 |
| 669315 | ZSFH | 999 | 2022-06-30 | 430509.0 |
## Merge the Region dataframe with the Zillow sales data
# Inner join on the shared 'region_id' key, attaching county/state labels
# to every sales observation.
zillow_merge_df = region_df.merge(zillow_data, on='region_id')
# Check the merged Zillow data
zillow_merge_df.head()
| region_id | region_type | region | county | state | indicator_id | date | value | |
|---|---|---|---|---|---|---|---|---|
| 0 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-02-28 | 139430.0 |
| 1 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-03-31 | 139459.0 |
| 2 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-04-30 | 139659.0 |
| 3 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-05-31 | 139887.0 |
| 4 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-06-30 | 140303.0 |
We couldn't find the county coordinates from Zillow, so we sourced the data from Wikipedia. We are going to have to merge the data with Zillow based on county and state.
# Read in county data with coordinates (sourced from Wikipedia).
county_coordinates_df = pd.read_csv(
    Path('counties_w_coordinates.csv')
)
# Clean up data.
# Rename columns so we can merge with the Zillow data on 'county' and
# 'state'. The Wikipedia county header embeds a non-breaking space (\xa0)
# and a footnote marker.
county_coordinates_df = county_coordinates_df.rename(
    columns={"County\xa0[2]": "county", "State": "state"}
)
# Normalize the coordinate strings, then convert to float for mapping:
#  - drop the degree symbol and the explicit '+' sign,
#  - replace unicode en-dashes (U+2013) with ASCII '-' so float() parses them.
# regex=False throughout: these are literal replacements — '+' is a regex
# metacharacter, which is what triggered the FutureWarnings under the old
# regex=True default and would break outright under pandas 2.0 semantics.
for _coord in ("Latitude", "Longitude"):
    county_coordinates_df[_coord] = (
        county_coordinates_df[_coord]
        .str.replace("°", "", regex=False)
        .str.replace("+", "", regex=False)
        .str.replace("\u2013", "-", regex=False)
        .astype(float)
    )
# Check the county coordinates data
county_coordinates_df.head()
C:\Users\gtkhh\anaconda3\envs\dev\lib\site-packages\ipykernel_launcher.py:18: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. C:\Users\gtkhh\anaconda3\envs\dev\lib\site-packages\ipykernel_launcher.py:19: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
| Sort [1] | state | FIPS | county | County Seat(s) [3] | Population | Land Area | Land Area.1 | Water Area | Water Area.1 | Total Area | Total Area.1 | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | -2010 | km² | mi² | km² | mi² | km² | mi² | NaN | NaN |
| 1 | 1.0 | AL | 1001.0 | Autauga | Prattville | 54,571 | 1,539.58 | 594.436 | 25.776 | 9.952 | 1,565.36 | 604.388 | 32.536382 | -86.644490 |
| 2 | 2.0 | AL | 1003.0 | Baldwin | Bay Minette | 182,265 | 4,117.52 | 1,589.78 | 1,133.19 | 437.527 | 5,250.71 | 2,027.31 | 30.659218 | -87.746067 |
| 3 | 3.0 | AL | 1005.0 | Barbour | Clayton | 27,457 | 2,291.82 | 884.876 | 50.865 | 19.639 | 2,342.68 | 904.515 | 31.870670 | -85.405456 |
| 4 | 4.0 | AL | 1007.0 | Bibb | Centreville | 22,915 | 1,612.48 | 622.582 | 9.289 | 3.587 | 1,621.77 | 626.169 | 33.015893 | -87.127148 |
# Merge the Zillow data and county coordinates data.
# Both frames now share cleaned 'county' and 'state' columns, so the inner
# join attaches coordinates to every sales row that has a match.
master_df = zillow_merge_df.merge(county_coordinates_df, on=['county', 'state'])
# Parse the date strings into real datetimes for time-series work.
master_df['date'] = pd.to_datetime(master_df['date'])
# Check the master data
master_df
| region_id | region_type | region | county | state | indicator_id | date | value | Sort [1] | FIPS | County Seat(s) [3] | Population | Land Area | Land Area.1 | Water Area | Water Area.1 | Total Area | Total Area.1 | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-02-28 | 139430.0 | 1922.0 | 37063.0 | Durham | 267,587 | 740.673 | 285.975 | 30.798 | 11.891 | 771.471 | 297.866 | 36.036589 | -78.877919 |
| 1 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-03-31 | 139459.0 | 1922.0 | 37063.0 | Durham | 267,587 | 740.673 | 285.975 | 30.798 | 11.891 | 771.471 | 297.866 | 36.036589 | -78.877919 |
| 2 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-04-30 | 139659.0 | 1922.0 | 37063.0 | Durham | 267,587 | 740.673 | 285.975 | 30.798 | 11.891 | 771.471 | 297.866 | 36.036589 | -78.877919 |
| 3 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-05-31 | 139887.0 | 1922.0 | 37063.0 | Durham | 267,587 | 740.673 | 285.975 | 30.798 | 11.891 | 771.471 | 297.866 | 36.036589 | -78.877919 |
| 4 | 999 | county | Durham County; NC; Durham-Chapel Hill | Durham | NC | ZSFH | 1997-06-30 | 140303.0 | 1922.0 | 37063.0 | Durham | 267,587 | 740.673 | 285.975 | 30.798 | 11.891 | 771.471 | 297.866 | 36.036589 | -78.877919 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 628666 | 100 | county | Bibb County; AL; Birmingham-Hoover | Bibb | AL | ZSFH | 2022-02-28 | 161462.0 | 4.0 | 1007.0 | Centreville | 22,915 | 1,612.48 | 622.582 | 9.289 | 3.587 | 1,621.77 | 626.169 | 33.015893 | -87.127148 |
| 628667 | 100 | county | Bibb County; AL; Birmingham-Hoover | Bibb | AL | ZSFH | 2022-03-31 | 162369.0 | 4.0 | 1007.0 | Centreville | 22,915 | 1,612.48 | 622.582 | 9.289 | 3.587 | 1,621.77 | 626.169 | 33.015893 | -87.127148 |
| 628668 | 100 | county | Bibb County; AL; Birmingham-Hoover | Bibb | AL | ZSFH | 2022-04-30 | 163859.0 | 4.0 | 1007.0 | Centreville | 22,915 | 1,612.48 | 622.582 | 9.289 | 3.587 | 1,621.77 | 626.169 | 33.015893 | -87.127148 |
| 628669 | 100 | county | Bibb County; AL; Birmingham-Hoover | Bibb | AL | ZSFH | 2022-05-31 | 164684.0 | 4.0 | 1007.0 | Centreville | 22,915 | 1,612.48 | 622.582 | 9.289 | 3.587 | 1,621.77 | 626.169 | 33.015893 | -87.127148 |
| 628670 | 100 | county | Bibb County; AL; Birmingham-Hoover | Bibb | AL | ZSFH | 2022-06-30 | 166720.0 | 4.0 | 1007.0 | Centreville | 22,915 | 1,612.48 | 622.582 | 9.289 | 3.587 | 1,621.77 | 626.169 | 33.015893 | -87.127148 |
628671 rows × 20 columns
# Get mean data by state and county.
# numeric_only=True: the frame still carries string columns (region,
# indicator_id, county seat, ...). Older pandas silently dropped them from
# mean(); pandas 2.0 raises instead, so make the old behavior explicit.
county_df = master_df.groupby(["state", "county"]).mean(numeric_only=True)
# Divide price by 1000 so that it looks better on map.
county_df["value"] = county_df["value"] / 1000
# Check data
print(county_df.head())
print(county_df.tail())
| region_id | value | Sort [1] | FIPS | Latitude | Longitude | ||
|---|---|---|---|---|---|---|---|
| state | county | ||||||
| AL | Autauga | 1524.0 | 144.376453 | 1.0 | 1001.0 | 32.536382 | -86.644490 |
| Baldwin | 1525.0 | 172.793462 | 2.0 | 1003.0 | 30.659218 | -87.746067 | |
| Barbour | 1531.0 | 88.204607 | 3.0 | 1005.0 | 31.870670 | -85.405456 | |
| Bibb | 100.0 | 117.338341 | 4.0 | 1007.0 | 33.015893 | -87.127148 | |
| Blount | 883.0 | 126.394141 | 5.0 | 1009.0 | 33.977448 | -86.567246 |
| region_id | value | Sort [1] | FIPS | Latitude | Longitude | ||
|---|---|---|---|---|---|---|---|
| state | county | ||||||
| WY | Sweetwater | 3036.0 | 217.047598 | 3139.0 | 56037.0 | 41.660339 | -108.875676 |
| Teton | 745.0 | 850.719510 | 3140.0 | 56039.0 | 44.049321 | -110.588102 | |
| Uinta | 760.0 | 183.118510 | 3141.0 | 56041.0 | 41.284726 | -110.558947 | |
| Washakie | 2593.0 | 162.015983 | 3142.0 | 56043.0 | 43.878831 | -107.669052 | |
| Weston | 1486.0 | 169.969867 | 3143.0 | 56045.0 | 43.846213 | -104.570020 |
# Plot each county as a point on an OpenStreetMap tile map; marker size and
# color both encode the mean home price (already scaled down by 1000 above).
county_df.hvplot.points(
'Longitude',
'Latitude',
geo=True,
size='value',
color='value',
tiles='OSM',
height=700,
width=1200)
# Creates a DataFrame using only the columns we are interested in.
# .copy() makes an independent frame, so the assignments below modify it
# directly rather than a view of master_df — this eliminates the
# SettingWithCopyWarning the original column selection produced.
filtered_df = master_df[['date', 'county', 'state', 'value']].copy()
# Fold the state into the county label (e.g. "Durham, NC") so county names
# are unique nationwide, then drop the now-redundant state column.
filtered_df['county'] = filtered_df['county'] + ", " + filtered_df['state']
filtered_df = filtered_df.drop(columns=['state'])
C:\Users\gtkhh\anaconda3\envs\dev\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """
# The number of counties reporting changes over time, which was skewing the
# charts — count rows per date so we can normalize the sums below.
exploratory_df = filtered_df.groupby('date').count()
# Sum all county values per date to represent the entire nation.
nationwide_df = filtered_df.groupby('date').agg({'value': 'sum'})
# Divide each date's sum by the number of counties contributing to it, so
# the series reflects an average price rather than coverage growth.
nationwide_df['avg'] = nationwide_df['value'] / exploratory_df['county']
# Define a function for getting a nationwide MACD indicator using pandas_ta
def get_nationwide_macd(fast, slow, signal):
    """Compute a MACD over the nationwide average price series.

    fast / slow / signal are the usual MACD EMA periods.
    NOTE: append=True also writes the raw MACD columns back onto the
    global nationwide_df as a side effect.
    """
    macd = nationwide_df.ta.macd(close='avg', fast=fast, slow=slow, signal=signal, append=True)
    # pandas_ta names columns like MACD_6_12_4 — map them to friendlier
    # names, and drop the EMA warm-up rows that come back as NaN.
    friendly = {
        f'MACD_{fast}_{slow}_{signal}': 'fast_ema',
        f'MACDh_{fast}_{slow}_{signal}': 'signal',
        f'MACDs_{fast}_{slow}_{signal}': 'slow_ema',
    }
    macd = macd.rename(columns=friendly).dropna()
    # Scale down by 1000 so the plot reads like a momentum indicator.
    return macd / 1000
# Use the newly defined function: fast EMA 6, slow EMA 12, signal 4.
nationwide_macd_df = get_nationwide_macd(6, 12, 4)
# Graph the nationwide momentum indicator.
nationwide_macd_df.hvplot(title='US Housing Market Momentum', ylabel='Momentum')
# Show mean housing price by county — one series per county, selectable
# via the groupby dropdown.
# NOTE(review): 'Famiy' in the title string is a typo for 'Family' — it is
# user-visible chart text, left unchanged here.
filtered_df.hvplot(title='Mean Single Famiy Home Price',groupby='county', x='date', yformatter='%.0f')
# Define a function for getting a county-specific MACD indicator using pandas_ta
def get_county_macd(fast, slow, signal):
    """Compute a MACD indicator for each county's own price series.

    fast / slow / signal are the usual MACD EMA periods.
    Returns a date-indexed frame with 'county', 'fast_ema', 'signal',
    'slow_ema' columns (values scaled down by 1000).

    Fix: the MACD must be computed within each county group. Running it
    over the concatenated all-county 'value' column (as before) lets the
    EMAs bleed across county boundaries, corrupting the opening rows of
    every county's indicator.
    """
    pieces = []
    for _, group in filtered_df.groupby('county', sort=False):
        g = group.copy()
        g.ta.macd(close='value', fast=fast, slow=slow, signal=signal, append=True)
        pieces.append(g)
    county_macd_df = pd.concat(pieces)
    # Making DataFrame look nice: friendlier column names, drop the EMA
    # warm-up rows (NaN within each county's opening window).
    county_macd_df = county_macd_df.rename(columns={
        f'MACD_{fast}_{slow}_{signal}': 'fast_ema',
        f'MACDh_{fast}_{slow}_{signal}': 'signal',
        f'MACDs_{fast}_{slow}_{signal}': 'slow_ema',
    }).dropna()
    county_macd_df = county_macd_df.drop(columns='value').set_index('date')
    # Scale down by 1000 so the plot reads like a momentum indicator.
    county_macd_df[['fast_ema', 'signal', 'slow_ema']] = county_macd_df[['fast_ema', 'signal', 'slow_ema']] / 1000
    return county_macd_df
# Use newly defined function: fast EMA 6, slow EMA 12, signal 4.
county_macd_df=get_county_macd(6,12,4)
# Plot the per-county MACD, one series per county via the groupby dropdown.
county_macd_df.hvplot(title='MAC/D by County', groupby='county', x='date', ylabel='Momentum')